from anndata import AnnData
import scrublet as scr
import pandas as pd
import scanpy as sc
import numpy as np
import rbcde
import scipy
import os
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.figdir = './figures-N2-clean-manifold/'
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80) # low dpi (dots per inch) yields small inline figures
Import the object from the prior notebook and kick out contaminants - trypsin populations outside clusters 5, 12 and 13. Regenerate raw form of data.
adata = sc.read('endometrium-N1-integration.h5ad')
mask = np.array([i in ['5','12','13'] for i in adata.obs['leiden']])
adata = adata[mask | (~mask & np.array(adata.obs['treatment'] == 'C'))]
adata = AnnData(adata.raw.X, var=adata.raw.var, obs=adata.obs)
Store the raw form in .raw. Filter out unexpressed genes, recompute mitochondrial percentage, then obtain log(CPM/100 + 1) form of data and store that in .X.
adata.raw = adata.copy()
sc.pp.filter_genes(adata, min_cells=3)
#convert to lower to be species agnostic: human mito start with MT-, mouse with mt-
mito_genes = [name for name in adata.var_names if name.lower().startswith('mt-')]
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
Remove cell cycle genes by inverting the object and clustering the gene space, reporting back the cluster with known cycling genes as members.
bdata = adata.copy()
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
bdata = bdata[:, bdata.var['highly_variable']]
bdata = bdata.copy().T
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack')
sc.pl.pca_variance_ratio(bdata, log=True, save='_ccg_identification.pdf')
num_pcs = 20
sc.pp.neighbors(bdata,n_pcs=num_pcs)
sc.tl.umap(bdata)
sc.tl.leiden(bdata, resolution=0.4)
bdata.obs['known_cyclers'] = [i in ['CDK1','MKI67','CCNB2','PCNA'] for i in bdata.obs_names]
sc.pl.umap(bdata,color=['leiden','known_cyclers'],color_map='OrRd',save='_ccg_identification.pdf')
print(bdata.obs.loc[['CDK1','MKI67','CCNB2','PCNA'],'leiden'])
With the cycling genes identified, clean up the plotting folder by moving all related plots to a subfolder.
def MovePlots(plotpattern, subplotdir):
if not os.path.exists(str(sc.settings.figdir)+subplotdir):
os.makedirs(str(sc.settings.figdir)+'/'+subplotdir)
os.system('mv '+str(sc.settings.figdir)+'/*'+plotpattern+'*.* '+str(sc.settings.figdir)+'/'+subplotdir)
MovePlots('ccg_identification','ccg_identification')
Flag cell cycling genes and continue standard pipeline in a helper object until PCA.
adata.var['is_ccg'] = [i in bdata.obs[bdata.obs['leiden']=='9'].index for i in adata.var_names]
#compute HVG and PCA on separate helper object and port the results back to the original one
bdata = adata[:, ~adata.var['is_ccg']]
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
for col in ['highly_variable','means', 'dispersions', 'dispersions_norm']:
adata.var[col] = bdata.var[col]
#fill NaNs with False so that subsetting to HVGs is possible
adata.var['highly_variable'].fillna(value=False, inplace=True)
bdata = bdata[:, bdata.var['highly_variable']]
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack', n_comps=50)
adata.obsm['X_pca'] = bdata.obsm['X_pca'].copy()
adata.uns['pca'] = bdata.uns['pca'].copy()
adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, 50))
adata.varm['PCs'][adata.var['highly_variable']] = bdata.varm['PCs']
sc.pl.pca_variance_ratio(adata, log=True, save='.pdf')
Correct cross-individual batch effects with Harmony.
num_pcs = 20
pca = adata.obsm['X_pca'][:, :num_pcs]
batch = adata.obs['individual']
%load_ext rpy2.ipython
%%R -i pca -i batch -o hem
library(harmony)
library(magrittr)
#this gets "90% of the way" to determinism. better than nothing.
set.seed(1)
hem = HarmonyMatrix(pca, batch, theta=1, verbose=FALSE, do_pca=FALSE)
hem = data.frame(hem)
Compute clustering and UMAP, plot metadata, clustering and individual location UMAPs.
adata.obsm['X_harmony'] = hem.values
sc.pp.neighbors(adata, use_rep='X_harmony')
sc.tl.leiden(adata, resolution = 0.9)
sc.tl.umap(adata)
#need to set this up manually as the different metadata csvs are not fully compatible
plotmeta = ['individual','location','phase','clinical','day','type','treatment','sample']
sc.pl.umap(adata, color=plotmeta, save='.pdf')
sc.pl.umap(adata, color='leiden', save='_clustering.pdf')
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_clustering_clusnumbers.pdf')
Plot canonical markers for ease of annotation.
with open('markers.txt','r') as fid:
markers = np.unique([line.rstrip() for line in fid.readlines()])
#make sure they're in the dataset, and sort them alphabetically for ease of finding things
markers = sorted([item for item in markers if item in adata.var_names])
for i in np.arange(np.ceil(len(markers)/4)):
sc.pl.umap(adata, color=markers[int(i*4):int((i+1)*4)], use_raw=False, save='-markers'+str(int(i+1))+'.pdf', color_map='OrRd')
#goodbye clutter!
MovePlots('markers','markers')
Identify cluster markers via the rank-biserial correlation, an effect size equivalent of the Wilcoxon test. Threshold the coefficient with the literature critical value for a large effect (0.5), and possibly lower the stringency if any clusters have fewer than five markers. Write out resulting marker list to CSV.
rbcde.RBC(adata, use_raw=False)
plot_dict = {}
degs = pd.DataFrame(columns=['cluster','RBC'])
for thresh in [0.5,0.3,0.2,0.1]:
degs_temp, temp_dict = rbcde.filter_markers(adata, use_raw=False, thresh=thresh)
for clus in temp_dict:
if len(temp_dict[clus])>=5 and clus not in plot_dict:
plot_dict[clus] = temp_dict[clus]
degs = pd.concat([degs, degs_temp.loc[degs_temp['cluster']==clus,:]])
if set(plot_dict.keys()) == set(temp_dict.keys()):
break
degs.to_csv(str(sc.settings.figdir)+'/cluster_markers_rbc.csv')
Do dotplots (for top 100 in the interest of legibility, if need be) and more detailed exports.
adata_count = AnnData(np.expm1(adata.X), var=adata.var, obs=adata.obs)
for clus in plot_dict:
degs_clus = degs.loc[degs['cluster']==clus,:].copy()
degs_clus['log2_FC'] = 0
degs_clus['percent_cluster'] = 0
degs_clus['percent_rest'] = 0
adata_clus = adata_count[adata.obs['leiden']==clus]
adata_rest = adata_count[[not i for i in adata.obs['leiden']==clus]]
adata_clus = adata_clus[:, degs_clus.index]
adata_rest = adata_rest[:, degs_clus.index]
degs_clus['log2_FC'] = np.asarray(np.log2(np.mean(adata_clus.X,axis=0)/np.mean(adata_rest.X,axis=0))).reshape(-1)
#are you expressed?
adata_clus.X.data = adata_clus.X.data > 0
adata_rest.X.data = adata_rest.X.data > 0
degs_clus['percent_cluster'] = np.asarray(100*np.sum(adata_clus.X,axis=0)/adata_clus.shape[0]).reshape(-1)
degs_clus['percent_rest'] = np.asarray(100*np.sum(adata_rest.X,axis=0)/adata_rest.shape[0]).reshape(-1)
degs_clus.to_csv(str(sc.settings.figdir)+'/cluster_markers_'+clus+'.csv')
sc.pl.dotplot(adata, {clus: plot_dict[clus][:100]}, groupby='leiden', use_raw=False, save='_cluster_markers_'+clus+'.pdf')
#goodbye clutter!
MovePlots('cluster_markers','cluster_markers')
And with that, think we're good. Save the object for later.
adata.write('endometrium-N2-clean-manifold.h5ad')